import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings("ignore")
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
# Path of the netflix.csv file to analyse.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
csv_path = "C:\\Users\\laxma\\Downloads\\netflix.csv"

# Keep the path and the loaded frame under separate names so `data` is only
# ever a DataFrame.
data = pd.read_csv(csv_path)
data.head()
| title | genre | language | imdb_score | premiere | runtime | year | |
|---|---|---|---|---|---|---|---|
| 0 | Notes for My Son | Drama | Spanish | 6.3 | 11/24/2020 | 83 | 2020 |
| 1 | To Each, Her Own | Romantic comedy | French | 5.3 | 6/24/2018 | 95 | 2018 |
| 2 | The Lovebirds | Romantic comedy | English | 6.1 | 5/22/2020 | 87 | 2020 |
| 3 | The Perfection | Horror-thriller | English | 6.1 | 5/24/2019 | 90 | 2019 |
| 4 | Happy Anniversary | Romantic comedy | English | 5.8 | 3/30/2018 | 78 | 2018 |
# Row and column counts of the loaded frame.
data.shape
(583, 7)
# Replace the CSV's original headers with the labels used by the rest of
# this notebook.
# NOTE: 'Relase date' is misspelled, but later cells index the frame by that
# exact label, so it has to stay as-is unless every reference is updated
# together.
col_names = [
    'Name',
    'classification',
    'language type',
    'rating',
    'Relase date',
    'Duration',
    'year',
]
data.columns = col_names
col_names
['Name', 'classification', 'language type', 'rating', 'Relase date', 'Duration', 'year']
# Re-display the first rows to confirm the new column labels are in place.
data.head()
| Name | classification | language type | rating | Relase date | Duration | year | |
|---|---|---|---|---|---|---|---|
| 0 | Notes for My Son | Drama | Spanish | 6.3 | 11/24/2020 | 83 | 2020 |
| 1 | To Each, Her Own | Romantic comedy | French | 5.3 | 6/24/2018 | 95 | 2018 |
| 2 | The Lovebirds | Romantic comedy | English | 6.1 | 5/22/2020 | 87 | 2020 |
| 3 | The Perfection | Horror-thriller | English | 6.1 | 5/24/2019 | 90 | 2019 |
| 4 | Happy Anniversary | Romantic comedy | English | 5.8 | 3/30/2018 | 78 | 2018 |
# Per-column dtype and non-null counts; the object (text) columns are the
# ones that get encoded before model fitting further down.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 583 entries, 0 to 582 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 583 non-null object 1 classification 583 non-null object 2 language type 583 non-null object 3 rating 583 non-null float64 4 Relase date 583 non-null object 5 Duration 583 non-null int64 6 year 583 non-null int64 dtypes: float64(1), int64(2), object(4) memory usage: 32.0+ KB
# Print the frequency table of every column to see its cardinality and how
# the values are distributed.
col_names = ['Name','classification','language type','rating','Relase date','Duration','year']
for col in col_names:
    # The print call must be indented under the loop; at column 0 the
    # `for` statement has no body and the cell fails to parse.
    print(data[col].value_counts())
Notes for My Son 1
ReMastered: The Miami Showband Massacre 1
The Day of the Lord 1
Milestone 1
Sardar Ka Grandson 1
..
To All the Boys I've Loved Before 1
Tribhanga ? Tedhi Medhi Crazy 1
Team Foxcatcher 1
The Players 1
Biggie: I Got a Story to Tell 1
Name: Name, Length: 583, dtype: int64
Documentary 159
Drama 77
Comedy 49
Romantic comedy 39
Thriller 33
...
Political thriller 1
Fantasy 1
Romantic comedy-drama 1
Animation/Musical/Adventure 1
Supernatural drama 1
Name: classification, Length: 114, dtype: int64
English 401
Hindi 32
Spanish 31
French 20
Italian 14
Portuguese 12
Indonesian 9
Korean 6
Japanese 6
English/Spanish 5
German 5
Turkish 5
Polish 3
Dutch 3
Marathi 3
Filipino 2
Thai 2
English/Japanese 2
English/Hindi 2
English/Mandarin 2
English/Korean 1
Khmer/English/French 1
English/Akan 1
Bengali 1
English/Swedish 1
English/Arabic 1
English/Taiwanese/Mandarin 1
Norwegian 1
Tamil 1
English/Ukranian/Russian 1
Spanish/Catalan 1
English/Russian 1
Georgian 1
Spanish/English 1
Swedish 1
Malay 1
Thia/English 1
Spanish/Basque 1
Name: language type, dtype: int64
6.3 30
5.8 30
6.4 28
7.1 28
6.5 26
6.7 25
6.1 24
6.8 24
7.3 21
7.2 20
5.7 20
5.2 19
5.5 19
7.0 19
6.9 19
6.6 18
6.2 18
5.9 16
5.6 15
5.4 13
6.0 13
7.4 12
7.5 10
7.6 10
5.3 10
4.6 8
7.7 8
4.8 7
4.7 6
4.4 6
5.1 6
5.0 5
8.2 5
4.1 4
4.9 4
7.9 4
4.5 4
8.1 3
4.3 3
7.8 3
8.4 3
3.7 2
8.3 2
4.2 2
2.6 2
9.0 1
8.0 1
3.2 1
3.9 1
3.5 1
8.6 1
3.4 1
8.5 1
2.5 1
Name: rating, dtype: int64
10/2/2020 6
11/1/2019 5
10/18/2019 5
12/7/2018 4
1/15/2021 4
..
12/3/2020 1
8/2/2019 1
6/16/2017 1
10/13/2020 1
3/1/2021 1
Name: Relase date, Length: 386, dtype: int64
97 24
98 19
94 19
95 18
100 17
..
45 1
25 1
54 1
51 1
13 1
Name: Duration, Length: 124, dtype: int64
2020 182
2019 125
2018 99
2021 71
2017 66
2016 30
2015 9
2014 1
Name: year, dtype: int64
# Titles per year. 'year' is the target column in the model-building section,
# so this is also the class balance the classifier will see.
data['year'].value_counts()
2020 182 2019 125 2018 99 2021 71 2017 66 2016 30 2015 9 2014 1 Name: year, dtype: int64
# Missing-value count per column.
data.isnull().sum()
Name 0 classification 0 language type 0 rating 0 Relase date 0 Duration 0 year 0 dtype: int64
# Current column labels of the frame.
data.columns
Index(['Name', 'classification', 'language type', 'rating', 'Relase date',
'Duration', 'year'],
dtype='object')
# VISUALIZATION
# Frequency of each distinct rating value, drawn on a wide canvas so the
# x tick labels remain readable once rotated.
plt.figure(figsize=(10, 4))
sns.countplot(data=data, x='rating', color='cyan')
plt.xticks(rotation=90)
plt.show()
# Bar chart with rating on the x-axis and year as the bar height.
# NOTE(review): many rows share the same rating, so their bars are drawn on
# top of one another at the same x position — confirm this is the intended
# chart rather than an aggregate per rating.
plt.bar(x=data['rating'], height=data['year'])
plt.xticks(rotation=90)
plt.show()
# Plotly violin plot: title on the x-axis, classification on the y-axis,
# coloured per title.
# NOTE(review): every 'Name' value occurs exactly once in this dataset, so
# each violin is built from a single row — confirm this is the intended view.
fig = px.violin(data_frame=data, x='Name', y='classification', color='Name')
fig.show()

# Plotly bar chart: Duration on the x-axis, one coloured row per language.
fig = px.bar(data_frame=data, x='Duration', y='language type', color='language type')
fig.show()
# Bar plot of year (x) per language (y).
# x and y are passed by keyword: seaborn deprecated positional data vectors
# in 0.11 and rejects them from 0.12 onwards, so the positional call only
# works on old releases.
sns.barplot(x='year', y='language type', data=data, color='r')
plt.xticks(rotation=90)
plt.show()
# Line plot of rating against Duration; the title is explicitly set to an
# empty string.
duration_axes = sns.lineplot(data=data, x='Duration', y='rating')
duration_axes.set_title('')
Text(0.5, 1.0, '')
# Distribution of titles across the language categories; the tick labels are
# rotated so the long category names do not overlap.
sns.displot(data=data, x="language type")
plt.xticks(rotation=90)
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37], [Text(0, 0, 'Spanish'), Text(1, 0, 'French'), Text(2, 0, 'English'), Text(3, 0, 'Portuguese'), Text(4, 0, 'English/Mandarin'), Text(5, 0, 'English/Spanish'), Text(6, 0, 'German'), Text(7, 0, 'Italian'), Text(8, 0, 'Korean'), Text(9, 0, 'Thia/English'), Text(10, 0, 'Hindi'), Text(11, 0, 'Malay'), Text(12, 0, 'Japanese'), Text(13, 0, 'Marathi'), Text(14, 0, 'Swedish'), Text(15, 0, 'Indonesian'), Text(16, 0, 'Dutch'), Text(17, 0, 'Filipino'), Text(18, 0, 'Spanish/English'), Text(19, 0, 'English/Taiwanese/Mandarin'), Text(20, 0, 'Georgian'), Text(21, 0, 'English/Hindi'), Text(22, 0, 'English/Russian'), Text(23, 0, 'Spanish/Catalan'), Text(24, 0, 'English/Ukranian/Russian'), Text(25, 0, 'Tamil'), Text(26, 0, 'Norwegian'), Text(27, 0, 'Turkish'), Text(28, 0, 'English/Arabic'), Text(29, 0, 'Polish'), Text(30, 0, 'English/Swedish'), Text(31, 0, 'Bengali'), Text(32, 0, 'English/Japanese'), Text(33, 0, 'Thai'), Text(34, 0, 'English/Korean'), Text(35, 0, 'Khmer/English/French'), Text(36, 0, 'English/Akan'), Text(37, 0, 'Spanish/Basque')])
# MODEL BUILDING
# The target is the 'year' column; every remaining column is kept as a feature.
# NOTE(review): 'Relase date' is a date string that contains the year being
# predicted, and 'Name' is unique per row — confirm both really belong in the
# feature set.
y = data['year']
X = data.drop(columns=['year'])

from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
X_train.shape, X_test.shape
((390, 6), (193, 6))
# Feature dtypes of the training split; the object columns are the ones that
# still hold raw text at this point.
X_train.dtypes
Name object classification object language type object rating float64 Relase date object Duration int64 dtype: object
# Peek at the training rows before any encoding is applied.
X_train.head()
| Name | classification | language type | rating | Relase date | Duration | |
|---|---|---|---|---|---|---|
| 552 | American Son | Drama | English | 5.8 | 11/1/2019 | 90 |
| 280 | Beats | Drama | English | 7.1 | 6/19/2019 | 110 |
| 234 | Nappily Ever After | Comedy-drama | English | 6.4 | 9/21/2018 | 98 |
| 255 | Eurovision Song Contest: The Story of Fire Saga | Musical comedy | English | 6.5 | 6/26/2020 | 123 |
| 438 | A Christmas Prince: The Royal Wedding | Romantic comedy | English | 5.3 | 11/30/2018 | 92 |
pip install category_encoders
Requirement already satisfied: category_encoders in d:\anaconda files\lib\site-packages (2.6.3)
Requirement already satisfied: numpy>=1.14.0 in d:\anaconda files\lib\site-packages (from category_encoders) (1.26.4)
Requirement already satisfied: scipy>=1.0.0 in d:\anaconda files\lib\site-packages (from category_encoders) (1.9.1)
Requirement already satisfied: pandas>=1.0.5 in d:\anaconda files\lib\site-packages (from category_encoders) (1.4.4)
Requirement already satisfied: statsmodels>=0.9.0 in d:\anaconda files\lib\site-packages (from category_encoders) (0.13.2)
Requirement already satisfied: scikit-learn>=0.20.0 in d:\anaconda files\lib\site-packages (from category_encoders) (1.0.2)
Requirement already satisfied: patsy>=0.5.1 in d:\anaconda files\lib\site-packages (from category_encoders) (0.5.2)
Requirement already satisfied: python-dateutil>=2.8.1 in d:\anaconda files\lib\site-packages (from pandas>=1.0.5->category_encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in d:\anaconda files\lib\site-packages (from pandas>=1.0.5->category_encoders) (2022.1)
Requirement already satisfied: six in d:\anaconda files\lib\site-packages (from patsy>=0.5.1->category_encoders) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\anaconda files\lib\site-packages (from scikit-learn>=0.20.0->category_encoders) (2.2.0)
Requirement already satisfied: joblib>=0.11 in d:\anaconda files\lib\site-packages (from scikit-learn>=0.20.0->category_encoders) (1.1.0)
Collecting numpy>=1.14.0
Using cached numpy-1.24.4-cp39-cp39-win_amd64.whl (14.9 MB)
Requirement already satisfied: packaging>=21.3 in d:\anaconda files\lib\site-packages (from statsmodels>=0.9.0->category_encoders) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in d:\anaconda files\lib\site-packages (from packaging>=21.3->statsmodels>=0.9.0->category_encoders) (3.0.9)
Installing collected packages: numpy
Attempting uninstall: numpy
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
Successfully uninstalled numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'D:\\anaconda files\\Lib\\site-packages\\~3mpy.libs\\libopenblas64__v0.3.23-293-gc2f4bdbb-gcc_10_3_0-2bde3a66a51006b2b53eb373ff767a3f.dll' Consider using the `--user` option or check the permissions.
import category_encoders as ce

# Ordinal-encode only the text columns. 'rating' and 'Duration' are already
# numeric; sending them through the encoder replaced each value with an
# arbitrary integer code assigned in order of first appearance, which threw
# away their real magnitude and ordering before the model ever saw them.
# NOTE(review): 'Name' is unique per row, so no test title was seen during
# fit and the column carries no signal for the test split — consider
# dropping it.
text_columns = ['Name', 'classification', 'language type', 'Relase date']
encoder = ce.OrdinalEncoder(cols=text_columns)

# Fit on the training split only, then reuse the learned mapping for the test
# split so that no information from the test rows leaks into the encoding.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
X_train.head()
| Name | classification | language type | rating | Relase date | Duration | |
|---|---|---|---|---|---|---|
| 552 | 1 | 1 | 1 | 1 | 1 | 1 |
| 280 | 2 | 1 | 1 | 2 | 2 | 2 |
| 234 | 3 | 2 | 1 | 3 | 3 | 3 |
| 255 | 4 | 3 | 1 | 4 | 4 | 4 |
| 438 | 5 | 4 | 1 | 5 | 5 | 5 |
# Inspect the encoded test split.
# NOTE(review): the -1 entries appear to mark categories the encoder did not
# see while fitting on the training split — confirm against the encoder docs.
X_test.head()
| Name | classification | language type | rating | Relase date | Duration | |
|---|---|---|---|---|---|---|
| 355 | -1.0 | 1.0 | 1.0 | 20.0 | -1.0 | 40.0 |
| 407 | -1.0 | 18.0 | 1.0 | 42.0 | -1.0 | 32.0 |
| 90 | -1.0 | 5.0 | 1.0 | 2.0 | -1.0 | 20.0 |
| 402 | -1.0 | 44.0 | 1.0 | 28.0 | 34.0 | 61.0 |
| 268 | -1.0 | 5.0 | 17.0 | -1.0 | -1.0 | 32.0 |
from sklearn.ensemble import RandomForestClassifier

# Random forest that treats each year as a class label; the seed is fixed so
# repeated runs build the same forest.
rfc = RandomForestClassifier(random_state=60)
rfc.fit(X=X_train, y=y_train)
RandomForestClassifier(random_state=60)
# Predicted year for every row of the held-out split.
y_pred = rfc.predict(X=X_test)
y_pred
array([2019, 2019, 2019, 2020, 2019, 2019, 2019, 2019, 2019, 2019, 2018,
2017, 2019, 2019, 2019, 2019, 2019, 2020, 2019, 2018, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2018, 2019, 2019, 2019, 2020,
2019, 2020, 2019, 2019, 2019, 2019, 2018, 2019, 2019, 2019, 2019,
2018, 2019, 2019, 2019, 2019, 2020, 2019, 2020, 2019, 2019, 2020,
2019, 2020, 2019, 2019, 2019, 2020, 2019, 2019, 2020, 2020, 2019,
2019, 2020, 2019, 2019, 2017, 2019, 2019, 2019, 2019, 2020, 2019,
2020, 2019, 2018, 2019, 2019, 2020, 2020, 2019, 2018, 2018, 2017,
2020, 2019, 2018, 2019, 2019, 2020, 2019, 2019, 2019, 2019, 2019,
2019, 2020, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2020, 2019, 2019, 2020, 2021, 2019, 2019, 2019, 2019, 2019,
2018, 2018, 2019, 2017, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2018, 2018, 2019, 2019, 2018, 2017, 2019, 2019, 2019, 2019, 2020,
2019, 2020, 2019, 2019, 2019, 2019, 2019, 2020, 2020, 2019, 2019,
2019, 2018, 2019, 2019, 2019, 2020, 2018, 2019, 2019, 2020, 2019,
2019, 2018, 2019, 2018, 2019, 2018, 2018, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2020, 2020, 2020, 2019, 2019, 2019, 2019,
2018, 2020, 2018, 2020, 2019, 2019], dtype=int64)
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted year equals the true year.
# The label reports the forest's actual tree count instead of a hard-coded
# "10", which did not match a forest built with the default n_estimators.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'model accuracy score with {rfc.n_estimators} decision-trees : {test_accuracy:0.4f}')
model accuracy score with 10 decision-tree : 0.2850